In [ ]:
    
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib
    
In [ ]:
    
# Select the Tk-based matplotlib backend before pyplot is imported.
# NOTE(review): a later cell runs `%matplotlib inline`, which picks the inline
# backend, so this selection looks redundant -- confirm it is still needed.
matplotlib.use("TkAgg")
    
In [ ]:
    
import matplotlib.pyplot as plt
%matplotlib inline
    
In [ ]:
    
# Directory holding the input text files, given relative to the working directory.
data_dir = '../data/'
    
In [ ]:
    
# Path to the Macbeth text, one of the two files the sentences are read from.
macbeth_file = data_dir + 'macbeth.txt'
    
In [ ]:
    
# Path to the Julius Caesar text, the second file the sentences are read from.
caesar_file = data_dir + 'julius_caesar.txt'
    
In [ ]:
    
# Path to the stop-word list, which is read one entry per line.
stopword_file = data_dir + 'long_stopwords.txt'
    
In [ ]:
    
# Load the stop-word list: one entry per line, reduced to alphanumerics only
# (the same character filter that clean() applies to each token; the regex
# also removes the trailing newline, so no separate newline pass is needed).
#
# The result is materialised as a set instead of being left as a map()
# object. On Python 3 map() returns a one-shot iterator, so the first
# `word in stop_words` test would exhaust it and every later word would
# slip through the stop-word filter. A set also gives constant-time lookups.
with open(stopword_file, 'r') as inpFile:
    stop_words = {re.sub('[^A-Za-z0-9]+', '', line) for line in inpFile}
    
In [ ]:
    
# Display the loaded stop words for inspection.
stop_words
    
In [ ]:
    
# Check which kind of container stop_words ended up as.
type(stop_words)
    
In [ ]:
    
def clean(word):
    """Normalise a raw token and filter out stop words.

    The token is stripped of surrounding whitespace, lower-cased, and has
    every non-alphanumeric character removed. Returns the normalised token,
    or '' when it appears in the module-level ``stop_words`` collection.
    """
    normalised = re.sub('[^A-Za-z0-9]+', '', word.strip().lower())
    return '' if normalised in stop_words else normalised
    
In [ ]:
    
# Sanity check: the apostrophe is removed, so this yields 'kings' unless that
# form is listed as a stop word (in which case '' is returned).
clean("king's")
    
In [ ]:
    
# Sanity check: normalises to 'theyll'; returns '' if that form is a stop word.
clean("they'll")
    
In [ ]:
    
def load_sentences(path):
    """Read a text file and return one list of cleaned tokens per line.

    Each line is split on whitespace and every token is passed through
    clean(). Tokens that clean() reduces to '' (stop words or pure
    punctuation) are dropped, and lines left with no tokens are skipped.

    Parameters:
        path: path of the text file to read.

    Returns:
        A list of lists of strings, in file order.
    """
    result = []
    with open(path, 'r') as inpFile:
        for line in inpFile:
            kept = [token for token in map(clean, line.split()) if token]
            # The original guard (`line is not None or line != '\n'`) was
            # always true, so blank lines were appended as empty sentences.
            if kept:
                result.append(kept)
    return result


# Each sentence is a real list rather than a lazy map()/filter() object.
# On Python 3 those are one-shot iterators, and Word2Vec walks the corpus
# more than once (vocabulary scan, then training), so lazy sentences would
# already be exhausted by the time training starts.
sentences = load_sentences(macbeth_file) + load_sentences(caesar_file)
    
In [ ]:
    
# Confirm the container type of the collected sentences.
type(sentences)
    
In [ ]:
    
# Train a Word2Vec model on the cleaned lines of both plays. Per the gensim
# API: 5-word context window, 500-dimensional vectors, 4 worker threads, and
# words occurring fewer than 5 times are ignored.
# NOTE(review): `size` is the keyword used by older gensim releases -- confirm
# it matches the installed version.
model = Word2Vec(sentences, window=5, size=500, workers=4, min_count=5)
    
In [ ]:
    
# Inspect the vocabulary the model kept after training.
# NOTE(review): `model.vocab` is the accessor of older gensim releases; newer
# ones expose the vocabulary through `model.wv` -- confirm against the
# installed version.
model.vocab
    
In [ ]:
    
# Collect every vocabulary word and its vector as two parallel lists:
# labels[i] is the word whose embedding is tokens[i].
labels = list(model.vocab)
tokens = [model[label] for label in labels]
    
In [ ]:
    
# Configure t-SNE to reduce the word vectors to 2 components, starting from a
# PCA initialisation with perplexity 30 and an iteration budget of 5000.
# NOTE(review): no random_state is passed, so the layout may differ between
# runs -- confirm whether reproducible plots are required.
tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    
In [ ]:
    
# Project the word vectors; each row of new_values is the low-dimensional
# point for the word at the same position in `labels`.
new_values = tsne_model.fit_transform(tokens)
    
In [ ]:
    
# Split the projected points into separate coordinate lists for plotting.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]
    
In [ ]:
    
# Scatter every projected word and label it with the word itself.
annotation_style = dict(xytext=(5, 2),
                        textcoords='offset points',
                        ha='right',
                        va='bottom')

plt.figure(figsize=(16, 12))
for idx, x_pos in enumerate(x):
    y_pos = y[idx]
    plt.scatter(x_pos, y_pos)
    plt.annotate(labels[idx], xy=(x_pos, y_pos), **annotation_style)
plt.show()
    
In [ ]:
    
# Analogy-style query: words closest to (caesar + duncan - scotland).
model.most_similar(positive=['caesar','duncan'],negative=['scotland'])
    
In [ ]:
    
# Analogy-style query: words closest to (caesar + duncan - macbeth).
model.most_similar(positive=['caesar','duncan'],negative=['macbeth'])
    
In [ ]:
    
# Analogy-style query: words closest to (caesar + macbeth - banquo).
model.most_similar(positive=['caesar','macbeth'],negative=['banquo'])
    
In [ ]:
    
# Analogy-style query: words closest to (rome + scotland - banquo).
model.most_similar(positive=['rome','scotland'],negative=['banquo'])
    
In [ ]:
    
# Ask the model which of the four words fits least well with the others.
model.doesnt_match("duncan macbeth scotland banquo".split())
    
In [ ]: